Set up

using <- function(...) {
    libs <- unlist(list(...))
    need <- libs <- libs[!unlist(lapply(libs, require, character.only = T))]
    if(length(need) > 0){ 
        install.packages(need, repos = "https://cloud.r-project.org")
        need <- need[!unlist(lapply(need, require, character.only = T))]
        if (length(need) > 0) {
          if (!requireNamespace("BiocManager", quietly = T))
            install.packages("BiocManager")
          BiocManager::install(need)
        }
    }
    lapply(libs, require, character.only = T)
}
using("tidyverse", "DESeq2", "apeglm", "gprofiler2", "fgsea", "msigdbr", "org.Hs.eg.db",
      "limma", "patchwork", "ggrepel", "reactable", "shiny", "tippy", "ggdist", "pheatmap")

Part 1


Import

# load data
load('lec6_1.rda')

# create DESeq object
dds <- DESeqDataSetFromMatrix(countData = cts, colData = coldata, design = ~condition)

# what samples are we working with?
coldata
condition
ctrl1 ctrl
ctrl2 ctrl
ctrl3 ctrl
ctrl4 ctrl
ctrl5 ctrl
t1d1 t1d
t1d2 t1d
t1d3 t1d

Run analysis

How does normalization show up on the MA plot?

dds <- DESeq(dds)

par(mfrow = c(1, 2))
limma::plotMA(log2(counts(dds) + 1), array = 8, main = "raw")
abline(h = 0, col = "grey")
limma::plotMA(log2(counts(dds, normalized = T)  + 1), array = 8, main = "normalized")
abline(h = 0, col = "grey")


Normalize

What is the RLE normalization doing?

ref <- rowMeans(log(cts))
colnames(cts) %>%
  lapply(function(s) {
    cnts <- cts[,s]
    tibble(x = exp((log(cnts) - ref)[is.finite(ref) & cnts > 0]),
           y = s)
  }) %>%
  bind_rows() %>%
  ggplot(aes(x, y)) +
  stat_histinterval(breaks = 100) +
  scale_x_continuous(trans = 'log', breaks = c(.05,1,20)) +
  annotation_logticks(sides = 'b') +
  labs(x = 'Ratio', y = 'Sample')


Get results

res <- results(dds)
head(res)
## log2 fold change (MLE): condition t1d vs ctrl 
## Wald test p-value: condition t1d vs ctrl 
## DataFrame with 6 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat    pvalue
##                 <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000223972  0.186742      -1.650058   3.64464 -0.452735  0.650739
## ENSG00000227232  5.733825       0.497553   1.02417  0.485813  0.627100
## ENSG00000278267  1.101126      -2.080596   3.39879 -0.612158  0.540433
## ENSG00000243485  0.451844       1.561078   3.54424  0.440455  0.659607
## ENSG00000284332  0.000000             NA        NA        NA        NA
## ENSG00000237613  0.000000             NA        NA        NA        NA
##                      padj
##                 <numeric>
## ENSG00000223972        NA
## ENSG00000227232  0.855427
## ENSG00000278267        NA
## ENSG00000243485        NA
## ENSG00000284332        NA
## ENSG00000237613        NA

Shrink fold change estimate

resLFC <- lfcShrink(dds, coef = 2, type = "apeglm", res = res)

# what does coef = 2 mean?
resultsNames(dds)
## [1] "Intercept"             "condition_t1d_vs_ctrl"

Compare effect of shrinkage

MA

par(mfrow = c(1, 2))
DESeq2::plotMA(res, ylim = c(-5, 5), maiin = 'Original')
DESeq2::plotMA(resLFC, ylim = c(-5, 5), main = 'Shrunken')

Volcano

volc <- function(r, x, y, ttl) {
  d <- as.data.frame(r) %>%
    dplyr::rename(x = !!x, y = !!y) %>%
    mutate(kind = case_when(abs(x) > 2 & y < .01 ~ 'DE',
                            abs(x) > 2 ~ 'big',
                            y < .01 ~ 'sig',
                            T ~ 'NS'),
           y = -log10(y))
  ct <- na.omit(d) %>% 
    dplyr::filter(kind == 'DE') %>%
    mutate(up = x > 0) %>%
    dplyr::count(up) %>%
    mutate(x = ifelse(up, Inf, -Inf),
           y = Inf,
           h = as.numeric(up))
  ggplot(d, aes(x, y, color = kind)) +
    geom_vline(xintercept = c(-2, 2), linetype = 'dashed') +
    geom_hline(yintercept = 2, linetype = 'dashed') +
    geom_point(alpha = .5) +
    geom_label(aes(x = x, y = y, label = n, hjust = h),
              vjust = 1, data = ct, inherit.aes = F) +
    scale_y_continuous() +
    scale_color_manual(values = c('forestgreen', 'red2', 'grey30',  'royalblue')) +
    labs(x = x, y = y, title = ttl)
}

volc(res, 'log2FoldChange','padj', 'Original') +
  theme(legend.position = 'none') +
  volc(resLFC, 'log2FoldChange', 'padj', 'Shrunken')


Grouping

Before delving further, let’s make sure the samples we’re comparing for DE make sense with a simple PCA

PCA

rld <- rlog(dds, blind = F)
pd <- plotPCA(rld, intgroup = "condition", ntop = 500, returnData = T)

data.frame(extra) %>%
  rownames_to_column('name') %>% 
  merge(pd) %>%
  ggplot(aes(x = PC1, y = PC2)) +
  geom_point(aes(color = condition)) +
  geom_text(aes(label = name)) +
  labs(x = sprintf('PC1: %.1f%% variance', 100 * attr(pd, 'percentVar')[1]),
       y = sprintf('PC2: %.1f%% variance', 100 * attr(pd, 'percentVar')[2])) +
  coord_fixed()

Hierarchical clustering

assay(rld) %>%
  apply(1, sd) %>%
  order(decreasing = TRUE) %>%
  .[1:100] %>%
  assay(rld)[.,] %>%
  heatmap()

`

  1. Why are we seeing intra-condition segregation? Use the extra metadata contained in extra

`


Design

How can we go about addressing this?

colData(dds) <- cbind(coldata, extra)
colData(dds)$condition <- factor(colData(dds)$condition)
design(dds) <- ~ sex + condition
dds <- DESeq(dds)
res.s <- results(dds)

design(dds) <- ~ sex + age + condition
dds <- DESeq(dds)
res.sa <- results(dds)

`

  1. How does different designs affect the number of DEGs? What about the preferences for the direction of change?

`


Part 2


Import

# clear workspace
rm(list = setdiff(ls(), 'volc'))

# load data
load('lec6_2.rda')

# create deseq dataset object
dds <- DESeqDataSetFromMatrix(countData = round(cts),
                              colData = coldata,
                              design = ~ condition)

# run deseq2
dds <- DESeq(dds)

# output all results
res <- results(dds)

DE

`

  1. Does the most significant DEG look convincing?

`

What about the top 10?

t10 <- data.frame(res) %>%
  rownames_to_column("gene") %>%
  slice_min(padj, n = 10, with_ties = F) %>%
  arrange(padj) %>%
  pull(gene)

counts(dds, normalized = TRUE) %>%
  data.frame() %>%
  rownames_to_column('gene') %>%
  dplyr::filter(gene %in% t10) %>%
  pivot_longer(-gene, names_to = 'samp', values_to = 'ct') %>%
  merge(rownames_to_column(data.frame(coldata), 'samp')) %>%
  mutate(gene = factor(gene, t10)) %>%
  ggplot(aes(x = condition, y = ct, color = condition)) +
  geom_point() +
  scale_y_log10() +
  facet_wrap(.~gene)

`

  1. Use volcano or MA plots and determine if there is prefential up- or down-regulation

`


Pathway over-representation analysis

Run

rd <- data.frame(res) %>%
  na.omit() %>%
  rownames_to_column('gene') %>%
  mutate(gene = sub('\\..*', '', gene)) 
g <- gost(rd$gene[rd$padj < .05 & abs(rd$log2FoldChange) > .5],
          organism = 'hsapiens',
          custom_bg = rd$gene)
g$result %>%
  dplyr::select(source, term_name, term_size, query_size,
                intersection_size, p_value) %>% 
  arrange(source, p_value) %>%
  mutate(p_value = formatC(p_value, digits = 3, format = 'g')) %>%
  reactable()

Summary

gostplot(g)

`

  1. What if you don’t supply the background?

`


Gene set enrichment analysis

We can additionally apply GSEA

# mapping Ensembl IDs to gene symbols
e2s <- AnnotationDbi::select(org.Hs.eg.db,
                             key = rd$gene,
                             columns = "SYMBOL",
                             keytype = "ENSEMBL") %>%
  na.omit() %>%
  deframe()

# use DE test statistic to rank genes
s <- rd %>% 
  mutate(symb = e2s[gene]) %>%
  na.omit() %>% 
  group_by(symb) %>%
  summarise(stat = mean(stat)) %>%
  deframe()

# pathways
p <- msigdbr(species = "Homo sapiens", category = "H") %>%
  mutate(gs_name = sub('^HALLMARK_', '', gs_name)) %>%
  {split(.$gene_symbol, .$gs_name)}

# GSEA
r <- fgsea(pathways = p, stats = s, eps = 0.0)

r %>%
  arrange(NES) %>% 
  mutate(pathway = fct_inorder(pathway)) %>% 
  ggplot(aes(x = pathway, y = NES)) + 
  geom_col(aes(fill = -log10(padj))) +
  scale_fill_viridis_c() +
  coord_flip() +
  labs(x = "Pathway",
       y = "Normalized enrichment score")

We can peek at the top hit

plotEnrichment(p[[arrange(r, padj)$pathway[1]]], s) +
  ggtitle(arrange(r, padj)$pathway[1])

Or as a table

arrange(r, padj) %>%
  dplyr::select(pathway, padj, ES, NES, size, leadingEdge) %>%
  mutate_at(c('pathway', 'padj', 'ES', 'NES'), function(x) {
    formatC(x, digits = 3, format = 'g')
  }) %>%
  reactable(
    searchable = T,
    highlight = T,
    wrap = F,
    resizable = T,
    striped = T,
    paginationType = "jump",
    showPageSizeOptions = T,
    defaultPageSize = 10,
    columns = list(
      leadingEdge = colDef(
        html = T,
        cell =  function(value, index, name) {
          value <- paste(value, collapse = ', ')
          div(
            style = "cursor: info;
                     white-space: nowrap;
                     overflow: hidden;
                     text-overflow: ellipsis;",
            tippy(text = value, tooltip = value)
          )
        }
      )
    )
  )